-- Starts TSystem.dms__pdf__task__update_volltext asynchronously via dblink_send_query
-- and returns without waiting for the result.
--   # makes sure only 1 async call is running: a boolean Settings flag
--     ('dblink::dms__pdf__task__update_volltext') is set to true before the query is sent
--     and set back to false by the async block when it finishes or fails
--   # raises an exception if that flag is still true when called
--   # raises an exception if the query could not be dispatched
--   # the async block logs start, number of processed files and the remaining folders
--     via TSystem.LogMessage
--   # all a_* parameters are forwarded unchanged to TSystem.dms__pdf__task__update_volltext
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__update_volltext_async(
    IN a_root            varchar,                  -- a DMS root directory, with subfolders for every year and there subfolders for every month
    IN a_restart_forced  boolean DEFAULT false,    -- forwarded: restart the task and overwrite an already existing progress status
    IN a_restart_ifdone  boolean DEFAULT false,    -- forwarded: restart the task if no folder is left to process
    IN a_abortcond_paths integer DEFAULT -1,       -- forwarded: path abort condition (must not be NULL, it is spliced into the query text)
    IN a_abortcond_files integer DEFAULT -1        -- forwarded: file abort condition (must not be NULL, it is spliced into the query text)
  )
  RETURNS void
  LANGUAGE plpgsql
  AS
  $$
  DECLARE
    _src_id               varchar; -- who is running this: 'APPS' if current_user is APPS, otherwise 'DBU'
    _ctx_id               varchar; -- log ctx suffix / identity

    _dblink_flag          varchar; -- name of the Settings flag: async query was started
    _dblink_name          varchar; -- async connection name
    _dblink_cfg           varchar; -- async connection config

    _sql                  varchar; -- query to execute

  BEGIN
    _src_id      := TSystem.IfThen(current_user = 'APPS', 'APPS', 'DBU');
    _ctx_id      := 'dms__pdf__task__update_volltext';

    _dblink_name := FORMAT('%s::%s', _src_id, _ctx_id);
    _dblink_cfg  := TSystem.dblink__connectionstring__get();
    _dblink_flag := FORMAT('dblink::%s', _ctx_id);

    -- NOTE: the template below is passed through FORMAT, a literal percent sign must be doubled
    _sql :=
      FORMAT
      (
        $DOEXECUTE$
        DO
        $DO$
        DECLARE
          _logid    bigint;
          _numfiles integer;
          _folders  varchar;
        BEGIN
          -- give caller time to finish out (end transaction and commit)
          PERFORM PG_SLEEP(1);

          BEGIN
            -- the helper is initialized inside the guarded block,
            -- so that a failure here resets the flag as well
            IF (NOT(TSystem.dms__pdf__task__init_python_helper(true))) THEN
              RAISE EXCEPTION 'Failed to initialize python helper instance %%', 'dummy';
            END IF;

            --loglevel  tlog.dblog_loglevel
            --func_name varchar
            --ctx       varchar
            --msg       varchar
            --parent_id bigint
            --logtype   tlog.dblog_logtype
            --logsource tlog.dblog_logsource

            _logid    := TSystem.LogMessage(%1$L, %2$L, %3$L, FORMAT($FMT$%5$s (Params: %%L %%L %%L %%L %%L)$FMT$, %9$L, %10$L, %11$L, %12$L, %13$L), NULL, %4$L);
            _numfiles := TSystem.dms__pdf__task__update_volltext(%9$L, %10$L::boolean, %11$L::boolean, %12$s, %13$s);
            _folders  := TSystem.Settings__GetText($STR$dms__pdf__task$STR$, $STR$no folders left$STR$);

            PERFORM TSystem.LogMessage(%1$L, %2$L, %3$L, concat(%6$L, _numfiles),         _logid, %4$L);
            PERFORM TSystem.LogMessage(%1$L, %2$L, %3$L, concat(%7$L, chr(10), _folders), _logid, %4$L);

            PERFORM TSystem.Settings__Set(%8$L, false);
          EXCEPTION
            WHEN OTHERS THEN
              -- we cannot really check (without blocking, in case it is still running) whether an async query is done,
              -- only that an async query was started within a connection and that the result was not yet requested
              -- use this flag to indicate we are done (no async query)
              -- NOTE(review): if Settings__Set writes transactionally, the re-raise below undoes this reset - confirm
              PERFORM TSystem.Settings__Set(%8$L, false);
              RAISE; -- re-raise original exception
          END;
        END
        $DO$;
        $DOEXECUTE$,
        'pllInfo',                        -- (1) log.loglevel
        _dblink_name,                     -- (2) log.func_name
        _ctx_id,                          -- (3) log.ctx
        'pltProfiling',                   -- (4) log.logtype

        'started',                        -- (5) log.msg: start
        'processed files ',               -- (6) log.msg: processed
        'folders still to be processed ', -- (7) log.msg: left

        _dblink_flag,                     -- (8) async query started, not done
        a_root,                           -- (9)
        a_restart_forced,                 -- (10)
        a_restart_ifdone,                 -- (11)
        a_abortcond_paths,                -- (12)
        a_abortcond_files                 -- (13)
      )
    ;

    -- we cannot really check (without blocking, in case it is still running) whether an async query is done,
    -- only that an async query was started within a connection and that the result was not yet requested
    -- use this flag to indicate we are done (no async query)
    IF (TSystem.Settings__GetBool(_dblink_flag, false)) THEN
      RAISE EXCEPTION 'Flag(Settings) % = true. Either async query (dblink: DBU|APPS::%) is still running or flag wasnt properly reset.', _dblink_flag, _ctx_id;
    END IF;

    -- always start from a fresh connection with the expected name
    IF _dblink_name = ANY(dblink_get_connections()) THEN
      PERFORM dblink_disconnect(_dblink_name);
    END IF;
    PERFORM dblink_connect_u(_dblink_name, _dblink_cfg);

    PERFORM dblink_exec(_dblink_name, 'SET statement_timeout=3600000', true); -- 1 hour
    PERFORM TSystem.Settings__Set(_dblink_flag, true);

    -- dblink_send_query returns 1 when the query was dispatched, 0 otherwise;
    -- without this check a failed dispatch would leave the flag set with nothing running
    IF (dblink_send_query(_dblink_name, _sql) IS DISTINCT FROM 1) THEN
      PERFORM TSystem.Settings__Set(_dblink_flag, false);
      RAISE EXCEPTION 'Failed to dispatch async query on dblink connection %', _dblink_name;
    END IF;
  END
  $$;

--------------------------------------------------------------------------------
-- Wrapper that runs TSystem.Settings__SetText(a_name, a_value) through dblink(a_dblink, ...)
--   # used by _dms__pdf__task__python_helper.set_status (SEE TSystem.dms__pdf__task__init_python_helper)
--   # original note: calling dblink via plpy.execute does not work
--   # a_dblink is handed to dblink() as its first argument (the python helper passes a connection string)
--   # returns the boolean of the remote 'SELECT True ...' row, NULL if no row came back
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__settings_set(a_name varchar, a_value varchar, a_dblink varchar)
  RETURNS boolean
  LANGUAGE plpgsql
  AS
  $$
  DECLARE
    _remote_ok boolean;

  BEGIN
    SELECT
      _remote.dummy
    INTO
      _remote_ok
    FROM
      dblink(
        a_dblink,
        Format('SELECT True FROM TSystem.Settings__SetText(%L, %L);', a_name, a_value)
      ) AS _remote(dummy boolean)
    ;

    RETURN _remote_ok;
  END;
  $$;

--------------------------------------------------------------------------------
-- Tries to take the advisory lock for dms__pdf__task without waiting (pg_try_advisory_lock)
--   # SEE TSystem.dms__pdf__task__init_python_helper (trylock)
--   # returns true if the lock was obtained, false otherwise
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__trylock()
  RETURNS boolean
  LANGUAGE plpgsql
  AS
  $$
  DECLARE
    -- lock key derived from the task name
    _task_key CONSTANT bigint := TSystem.string_to_int_hash('dms__pdf__task');

  BEGIN
    RETURN pg_try_advisory_lock(_task_key);
  END;
  $$;

--------------------------------------------------------------------------------
-- Releases the advisory lock for dms__pdf__task (pg_advisory_unlock)
--   # SEE TSystem.dms__pdf__task__init_python_helper (unlock)
--   # returns the result of pg_advisory_unlock
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__unlock()
  RETURNS boolean
  LANGUAGE plpgsql
  AS
  $$
  DECLARE
    -- lock key derived from the task name
    _task_key CONSTANT bigint := TSystem.string_to_int_hash('dms__pdf__task');

  BEGIN
    RETURN pg_advisory_unlock(_task_key);
  END;
  $$;

--------------------------------------------------------------------------------
-- Creates an instance of the python class _dms__pdf__task__python_helper and stores it in GD
-- under the key 'dms__pdf__task__init_python_helper'
--   # the instance is (re)created only if it is missing in GD or a_force is true
--   # useable throughout the whole session (GD)
--   # this allows to reuse python code from different python/plpython3u functions
--   # provides better performance,
--     because no need to call a plpython3u function via plpy.execute
--   # returns the result of the instance's _check() method
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__init_python_helper(a_force boolean DEFAULT false) RETURNS boolean AS
  $$
    import traceback
    from typing import Tuple
    from typing import Generator
    from pypdf import PdfReader

    # -- result triple used by the extract_* methods: (status, hint, extracted text)
    Tuple_StatusHintVal = Tuple[bool, str, str]

    if (a_force or ('dms__pdf__task__init_python_helper' not in GD)):
      # -- #####################################################################
      # -- #####################################################################
      class _dms__pdf__task__python_helper:
        status_name: str
        exception_hint: str
        current_pid: str
        current_app_name: str
        current_inet_client_addr: str
        current_inet_server_port: str
        current_database: str
        current_user: str
        curret_session_user: str  # -- (sic) name kept as is, the attribute may be read by other code

        # -- ###################################################################
        # -- stores the settings name of the task status and a snapshot of the
        # -- current session (pid, application name, addresses, database, users)
        def __init__(self) -> None:
          self.status_name = 'dms__pdf__task'
          self.exception_hint = 'exception in class: _dms__pdf__task__python_helper (call TSystem.dms__pdf__task__init_python_helper(true) within session to recreate instance)'

          _sql = (
            "SELECT "
            "  pg_backend_pid()::varchar AS current_pid, "
            "  current_setting('application_name')::varchar AS current_app_name, "
            "  inet_client_addr()::varchar AS current_inet_client_addr, "
            "  inet_server_port()::varchar AS current_inet_server_port, "
            "  current_database()::varchar AS current_database, "
            "  current_user::varchar AS current_user, "
            "  session_user::varchar AS curret_session_user"
            ";"
          )
          _ret = plpy.execute(_sql)
          self.current_pid              = _ret[0]["current_pid"]
          self.current_app_name         = _ret[0]["current_app_name"]
          self.current_inet_client_addr = _ret[0]["current_inet_client_addr"]
          self.current_inet_server_port = _ret[0]["current_inet_server_port"]
          self.current_database         = _ret[0]["current_database"]
          self.current_user             = _ret[0]["current_user"]
          self.curret_session_user      = _ret[0]["curret_session_user"]

        # -- ###################################################################
        # -- function which is called(manually) after instancing, to make sure instance is created
        def _check(self) -> bool:
          return True

        # -- ###################################################################
        # -- emits a_msg as a postgres NOTICE
        def notice(self, a_msg: str) -> None:
          plpy.notice(a_msg)

        # -- ###################################################################
        # -- returns the dblink connection string used by set_status
        # --   # user 'docker': fixed localhost connection string
        # --     NOTE(review): contains hard-coded credentials
        # --   # otherwise: result of tsystem.dblink__connectionstring__get()
        def get_dblink_cfg(self) -> str:
          if (self.current_user == 'docker'):
            return f"dbname={self.current_database} host=localhost port={self.current_inet_server_port} user=syncro password=syncro"
          else:
            _ret = plpy.execute("SELECT tsystem.dblink__connectionstring__get() AS dblink_cfg;")
            return _ret[0]["dblink_cfg"]

        # -- ###################################################################
        # -- tries to obtain the task lock, returns True if it was obtained
        def trylock(self) -> bool:

          import time

          # -- pg_advisory_lock does not offer a waittime parameter -> 3 attempts to get a lock
          _attempts = 3
          for _attempt in range(_attempts):
            _ret = plpy.execute("SELECT TSystem.dms__pdf__task__trylock() AS locked, pg_current_xact_id_if_assigned()::varchar as xact_id;")
            if (_ret[0]["locked"]):
              return True
            self.notice("trylock wait")
            # -- no need to wait after the last attempt
            if (_attempt < _attempts - 1):
              time.sleep(0.2)

          return False

        # -- ###################################################################
        # -- releases the task lock, returns the result of TSystem.dms__pdf__task__unlock()
        def unlock(self) -> bool:
          _ret = plpy.execute("SELECT TSystem.dms__pdf__task__unlock() AS unlocked, pg_current_xact_id_if_assigned()::varchar as xact_id;")
          return _ret[0]["unlocked"]

        # -- ###################################################################
        # -- reads the task status (settings entry self.status_name)
        # -- no lock used here, instead use trylock/unlock at callers site appropriately
        def get_status(self) -> str:
          _plan = plpy.prepare('SELECT TSystem.Settings__GetText($1::varchar) AS status;', ["varchar"])
          _ret = plpy.execute(_plan, [self.status_name])
          return _ret[0]["status"]

        # -- ###################################################################
        # -- writes the task status through TSystem.dms__pdf__task__settings_set (dblink)
        # -- no lock used here, instead use trylock/unlock at callers site appropriately
        def set_status(self, a_status: str) -> None:
          _sql = "SELECT * FROM TSystem.dms__pdf__task__settings_set($1::varchar, $2::varchar, $3::varchar);"
          _plan = plpy.prepare(_sql, ["varchar", "varchar", "varchar"])
          plpy.execute(_plan, [self.status_name, a_status, self.get_dblink_cfg()])

        # -- ###################################################################
        # -- scans a directory recursively for sub-directories
        # --   # directories found while a_recursionlvl is 0 are not yielded, only descended into
        # --     (with a_root being a dms root these are the <year> folders of <dms-root>\<year>\<month>)
        # --   # every directory found at a deeper level is yielded
        # --   # only directories are descended into, any other entry is skipped
        def get_folders(self, a_root: str, a_recursionlvl: int) -> Generator:

          import os

          with os.scandir(a_root) as _scan:
            for _item in _scan:
              if (not _item.is_dir()):
                continue
              if (a_recursionlvl >= 1):
                yield _item.path
              yield from self.get_folders(_item.path, a_recursionlvl + 1)

        # -- ###################################################################
        # -- dms__pdf__task: retrieves the next folder to check for pdf-files
        # --   # the status is a json list of folders: the first entry is removed,
        # --     the shortened list is written back and the removed entry is returned
        # --   # returns '' if the lock was not obtained or no folder is left
        def get_next_folder_to_process(self) -> str:

          import json

          if (not self.trylock()):
            self.notice('no lock')
            return ''

          try:
            _folders_json_str = self.get_status()
            # -- a missing status (None) is handled like an empty one
            _folders = json.loads(_folders_json_str) if _folders_json_str else []
            if (len(_folders) == 0):
              self.notice('no folders')
              return ''

            _folder = _folders.pop(0)
            self.set_status(json.dumps(_folders))
            return _folder
          finally:
            self.unlock()

        # -- ###################################################################
        # -- retrieves all .pdf files (full path/name) from a directory (not recursive)
        def get_pdffiles(self, a_directory: str) -> Generator:

          import os

          with os.scandir(a_directory) as _scan:
            for _item in _scan:
              if _item.is_file():
                if _item.path.lower().endswith('.pdf'):
                  yield _item.path

        # -- ###################################################################
        # -- extract the text from a pdf(PdfReader)
        # --   # returns (True, '', text) or (False, traceback + hint, '')
        def extract_text_from_pdf(self, a_pdf: PdfReader) -> Tuple_StatusHintVal:

          try:
            _pages = []

            for _page in a_pdf.pages:
              _page_text = _page.extract_text() or ""
              _pages.append(_page_text.strip())

            # -- dict.fromkeys keeps identical page texts only once (first occurrence, order preserved)
            _pdf_text = "\n".join(dict.fromkeys(_pages))
            return (True, '', _pdf_text)

          except Exception as e:
            _hint = "Exception while processing PdfReader"
            self.notice(f"{_hint}: {e}")
            _errinfo = "".join([traceback.format_exc(), _hint, '\n', self.exception_hint])
            return (False, _errinfo, '')

        # -- ###################################################################
        # -- extract the text from a pdf(full path/name)
        # --   # returns (True, '', text) or (False, traceback + hint, '')
        def extract_text_from_pdf_filename(self, a_filename: str) -> Tuple_StatusHintVal:

          from pypdf import PdfReader

          try:
            _pdfreader = PdfReader(a_filename)

            return self.extract_text_from_pdf(_pdfreader)

          except Exception as e:
            _hint = "Exception while processing file with PdfReader"
            self.notice(f"{_hint} {a_filename}: {e}")
            _errinfo = "".join([traceback.format_exc(), _hint, '\n', self.exception_hint])
            return (False, _errinfo, '')

        # -- ###################################################################
        # -- extract the text from a pdf(blob)
        # --   # returns (True, '', text) or (False, traceback + hint, '')
        def extract_text_from_pdf_bytea(self, a_bytes) -> Tuple_StatusHintVal:

          from pypdf import PdfReader
          from io import BytesIO

          try:
            _pdfreader = PdfReader(BytesIO(a_bytes))

            return self.extract_text_from_pdf(_pdfreader)

          except Exception as e:
            _hint = "Exception while processing bytes with PdfReader"
            self.notice(f"{_hint}: {e}")
            _errinfo = "".join([traceback.format_exc(), _hint, '\n', self.exception_hint])
            return (False, _errinfo, '')

      # -- #####################################################################
      # -- instantiate _dms__pdf__task__python_helper and make available via GD
      GD['dms__pdf__task__init_python_helper'] = _dms__pdf__task__python_helper()

    # -- call the _check() function to make sure instance is available
    return GD['dms__pdf__task__init_python_helper']._check()
  $$ LANGUAGE plpython3u;

--------------------------------------------------------------------------------
-- Extracts the text from a single pdf-file and returns exactly one row
--   # uses GD['dms__pdf__task__init_python_helper'] (created on demand)
--   # file missing: o_status = false, o_status_hint = 'File not found: ...', o_file_text = NULL
--   # otherwise status, hint and text come from the helper's extract_text_from_pdf_filename
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__file__extract_text(
    IN a_filename varchar,     -- full path/name of the pdf-file

    OUT o_status      boolean, -- status of the operation
    OUT o_status_hint text,    -- hint regarding status
    OUT o_file_path   varchar, -- directory part of a_filename
    OUT o_file_name   varchar, -- base name of file (without extension)
    OUT o_file_text   text     -- text extracted from file
  )
  RETURNS SETOF RECORD AS
  $$
    import os
    from pathlib import Path

    _gd_key = 'dms__pdf__task__init_python_helper'
    if (_gd_key not in GD):
      plpy.execute("SELECT TSystem.dms__pdf__task__init_python_helper();")
    _pdf_helper = GD[_gd_key]

    if os.path.isfile(a_filename):
      _status, _hint, _text = _pdf_helper.extract_text_from_pdf_filename(a_filename)
    else:
      _hint = f"File not found: {a_filename}"
      plpy.notice(_hint)
      _status, _text = False, None

    # -- status, hint, path, name(no extension), text
    yield (_status, _hint, os.path.dirname(a_filename), Path(a_filename).stem, _text)
  $$ LANGUAGE plpython3u;

--------------------------------------------------------------------------------
-- Extracts the text from a pdf-blob and returns exactly one row
--   # uses GD['dms__pdf__task__init_python_helper'] (created on demand)
--   # the row is the (status, hint, text) triple of the helper's extract_text_from_pdf_bytea
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__extract_text(
    IN a_pdf bytea,            -- content of the pdf

    OUT o_status      boolean, -- status of the operation
    OUT o_status_hint text,    -- hint regarding status
    OUT o_file_text   text     -- text extracted
  )
  RETURNS SETOF RECORD AS
  $$
    _gd_key = 'dms__pdf__task__init_python_helper'
    if (_gd_key not in GD):
      plpy.execute("SELECT TSystem.dms__pdf__task__init_python_helper();")

    _status, _hint, _text = GD[_gd_key].extract_text_from_pdf_bytea(a_pdf)
    yield (_status, _hint, _text)
  $$ LANGUAGE plpython3u;

--------------------------------------------------------------------------------
-- Extracts the text from a pdf-blob
-- and writes it back to picndoku.pd_volltext / pd_volltext_status
--   # extraction ok:     pd_volltext = extracted text, pd_volltext_status = null
--   # extraction failed: pd_volltext_status = {"status": ..., "hint": ...}, pd_volltext is left untouched
--   # always returns exactly one row
--   # o_status is false if a parameter is NULL, if the extraction failed
--     or if no picndoku row with pd_id = a_pd_id was updated
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__update_volltext_with(
    IN a_pd_id    integer,     -- picndoku.pd_id of the row to update
    IN a_filedata bytea,       -- content of the pdf

    OUT o_status      boolean, -- status of the operation
    OUT o_status_hint text     -- hint regarding status
  )
  RETURNS SETOF RECORD AS
  $$
  DECLARE
    _id       integer DEFAULT null; -- pd_id returned by the UPDATE, stays null if no row was hit
    _success  boolean DEFAULT null;
    _opmsg    text    DEFAULT null;
    _volltext text    DEFAULT null;

  BEGIN
    o_status      := true;
    o_status_hint := null;

    -- parameter checks: collect all problems before returning
    IF (a_filedata IS NULL) THEN
      o_status      := false;
      o_status_hint := concat_ws(', ', o_status_hint, 'no filedata (param)');
    END IF;

    IF (a_pd_id IS NULL) THEN
      o_status      := false;
      o_status_hint := concat_ws(', ', o_status_hint, 'no pd_id (param)');
    END IF;

    IF (o_status IS NOT TRUE) THEN
      RETURN NEXT;
      RETURN;
    END IF;

    SELECT
      _fnc.o_status,
      _fnc.o_status_hint,
      _fnc.o_file_text
    INTO
      _success,
      _opmsg,
      _volltext
    FROM
      TSystem.dms__pdf__extract_text(a_filedata) AS _fnc
    ;

    IF (_success) THEN
      UPDATE picndoku SET pd_volltext_status = null, pd_volltext = _volltext WHERE pd_id = a_pd_id RETURNING pd_id INTO _id;
    ELSE
      UPDATE picndoku SET pd_volltext_status = jsonb_build_object('status', _success, 'hint', _opmsg) WHERE pd_id = a_pd_id RETURNING pd_id INTO _id;
    END IF;

    -- _id is only set if the UPDATE hit a row (WHERE pd_id = a_pd_id);
    -- testing it directly also works for a_pd_id = 0 and needs no arithmetic on a_pd_id
    IF (_id IS NOT NULL) THEN
      o_status      := _success;
      o_status_hint := _opmsg;
    ELSE
      o_status      := false;
      o_status_hint := FORMAT('failed to update picndoku with pd_id = %s', a_pd_id);
    END IF;

    RETURN NEXT;
    RETURN;
  END;
  $$ LANGUAGE plpgsql;

--------------------------------------------------------------------------------
-- Task that extracts the text from the pdf-files within a dms-root, one result row per pdf-file
--   # !!! calling the function with different a_root at same time DOES NOT WORK !!!
--   # uses GD['dms__pdf__task__init_python_helper'] (created on demand)
--   # the function can be called from different sessions with the same a_root
--   # as long as a_restart_forced = false, all calls will work cooperatively
--   # on (re)start: the list of sub-directories is collected, sorted in reverse order
--     and stored as task status; all calls then take their next folder from this list,
--     check it for pdf-files and extract the text from them
--   # the abort conditions are checked after a folder was processed completely
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__extract_text(
    IN a_root            varchar,                  -- a DMS root directory, with subfolders for every year and there subfolders for every month
    IN a_restart_forced  boolean DEFAULT false,    -- true: rebuild the folder list, overwriting an already existing progress status
    IN a_restart_ifdone  boolean DEFAULT false,    -- true: rebuild the folder list if no next folder could be retrieved
    IN a_abortcond_paths integer DEFAULT -1,       -- stop after this many folders were processed (only if > 0)
    IN a_abortcond_files integer DEFAULT -1,       -- stop once at least this many files were processed (only if > 0)

    OUT o_status      boolean, -- status of the operation
    OUT o_status_hint text,    -- hint regarding status
    OUT o_file_path   varchar, -- directory of the file
    OUT o_file_name   varchar, -- base name of file (without extension)
    OUT o_file_text   text     -- text extracted from file
  )
  RETURNS SETOF RECORD AS
  $$
    import json
    import os
    import pathlib
    import time

    _gd_key = 'dms__pdf__task__init_python_helper'
    if (_gd_key not in GD):
      plpy.execute("SELECT TSystem.dms__pdf__task__init_python_helper();")
    _task = GD[_gd_key]

    _current_folder = _task.get_next_folder_to_process()
    _rebuild_list = a_restart_forced or ((_current_folder == '') and a_restart_ifdone)

    if (_rebuild_list):
      # -- without the lock the list is left as is
      if (_task.trylock()):
        try:
          plpy.notice(f"Start processing: {a_root}")
          # -- store the folders with forward slashes
          _all_folders = [
            str(pathlib.PureWindowsPath(_found).as_posix())
            for _found in _task.get_folders(a_root, 0)
          ]
          _all_folders.sort(reverse = True) # -- process newest first
          _task.set_status(json.dumps(_all_folders, sort_keys=False))
        finally:
          _task.unlock()

      _current_folder = _task.get_next_folder_to_process()
    else:
      plpy.notice('Continue processing:')

    _count_files = 0
    _count_paths = 0
    while (_current_folder != ''):
      _count_paths += 1

      plpy.notice("  processing {}".format(_current_folder))

      for _pdf_file in _task.get_pdffiles(_current_folder):
        plpy.notice("    {}".format(_pdf_file))
        _count_files += 1
        _status, _hint, _text = _task.extract_text_from_pdf_filename(_pdf_file)
        yield (_status, _hint, os.path.dirname(_pdf_file), pathlib.Path(_pdf_file).stem, _text)

      if ((a_abortcond_paths > 0) and (_count_paths >= a_abortcond_paths)):
        plpy.notice(f'    task processing stopped because of path abort ({_count_paths}/{a_abortcond_paths}) condition reached. continue by calling task again.')
        break

      if ((a_abortcond_files > 0) and (_count_files >= a_abortcond_files)):
        plpy.notice(f'    task processing stopped because of file abort({_count_files}/{a_abortcond_files}) condition reached. continue by calling task again.')
        break

      _current_folder = _task.get_next_folder_to_process()

    # -- only reached with an empty folder name if the loop was not left by an abort condition
    if (_current_folder == ''):
      plpy.notice('>> nothing to do. restart task' if (_count_files == 0) else '>> processing finished')
  $$ LANGUAGE plpython3u;

--------------------------------------------------------------------------------
-- Wrapper around TSystem.dms__pdf__task__extract_text (!!! SEE THERE !!!) to update picndoku.pd_volltext
--   # call first with a_restart_forced = true and reasonable parameters !!!
--   # a_abortcond_XXX can be used to break down the task into several smaller calls (time)
--   # rows are matched by file name only: lower(pd_dmsremotefile) = lower(<o_file_name>.pdf),
--     and only rows with pd_volltext IS NULL are touched
--   # extraction ok:     pd_volltext = extracted text, pd_volltext_status = null
--   # extraction failed: pd_volltext_status = {"status": ..., "hint": ...}
--   # returns the number of rows delivered by the task (= processed files)
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__update_volltext(
    IN a_root            varchar,                  -- a DMS root directory, with subfolders for every year and there subfolders for every month
    IN a_restart_forced  boolean DEFAULT false,    -- forwarded to TSystem.dms__pdf__task__extract_text
    IN a_restart_ifdone  boolean DEFAULT false,    -- forwarded to TSystem.dms__pdf__task__extract_text
    IN a_abortcond_paths integer DEFAULT -1,       -- forwarded to TSystem.dms__pdf__task__extract_text
    IN a_abortcond_files integer DEFAULT -1        -- forwarded to TSystem.dms__pdf__task__extract_text
  )
  RETURNS integer AS
  $$
  DECLARE
    _rec record;
    _count integer;

  BEGIN
    _count := 0;

    -- the task function is called in FROM: '(func(...)).*' in the select list is expanded
    -- into one function call per output column, which must be avoided for a function that
    -- is expensive and consumes the shared folder list
    FOR _rec IN
      SELECT
        _task.o_status,
        _task.o_status_hint,
        _task.o_file_name,
        _task.o_file_text
      FROM
        TSystem.dms__pdf__task__extract_text(a_root, a_restart_forced, a_restart_ifdone, a_abortcond_paths, a_abortcond_files) AS _task
    LOOP
      _count := _count + 1;

      IF _rec.o_status THEN
        UPDATE
          picndoku
        SET
          pd_volltext = _rec.o_file_text,
          pd_volltext_status = null
        WHERE
          picndoku.pd_volltext IS null AND (LOWER(picndoku.pd_dmsremotefile) = LOWER(concat(_rec.o_file_name::varchar, '.pdf')))
        ;
      ELSE
        UPDATE
          picndoku
        SET
          pd_volltext_status = jsonb_build_object('status', _rec.o_status, 'hint', _rec.o_status_hint)
        WHERE
          picndoku.pd_volltext IS null AND (LOWER(picndoku.pd_dmsremotefile) = LOWER(concat(_rec.o_file_name::varchar, '.pdf')))
        ;
      END IF;
    END LOOP;

    RETURN _count;
  END;
  $$ LANGUAGE plpgsql;

--------------------------------------------------------------------------------
-- Lists, grouped by folder, the number of pdf documents in picndoku without pd_volltext
--   # the remote file name is taken from TDMS.External_DMS__get_RemoteFilename(pd_id, false)
--   # filepath:  everything up to and including the last backslash or slash
--   # a document counts as pdf if the part after the last dot is 'pdf' (case-insensitive)
--   # ordered by number of documents, largest first
CREATE OR REPLACE FUNCTION TSystem.dms__pdf__task__check_volltext()
  RETURNS TABLE(filecount INTEGER, filepath varchar) AS
  $$
    WITH
      -- materialized, so that the remote file name is computed only once per row
      _names AS MATERIALIZED
      (
        SELECT
          TDMS.External_DMS__get_RemoteFilename(pd_id, false) AS filename
        FROM
          picndoku
        WHERE
          pd_volltext IS NULL
      ),
      -- E'' strings: the patterns do not depend on standard_conforming_strings
      _src AS
      (
        SELECT
          _names.filename,
          substring(_names.filename FROM E'(.*(\\\\|/)).*$') AS filepath,
          substring(_names.filename FROM E'\\.([^.]*)$')     AS fileext
        FROM
          _names
      )
    SELECT
      count(*)::integer AS filecount,
      _src.filepath
    FROM
      _src
    WHERE
      _src.fileext ILIKE 'pdf'
    GROUP BY
      _src.filepath
    ORDER BY
      filecount DESC
    ;
  $$ LANGUAGE sql;

--------------------------------------------------------------------------------
-- Trigger function on picndoku.pd_dms_delayed_info, to set pd_volltext
--   # only acts if the file extension of pd_dmsremotefile is 'pdf' and new.pd_volltext IS NULL
--   # the file content is fetched with tsystem.picndoku__delayed_upload__get and the text
--     is extracted with TSystem.dms__pdf__extract_text
--   # success: new.pd_volltext = text, new.pd_volltext_status = null
--   # failure: new.pd_volltext_status = {"status": false, "hint": ...}
--   # best effort: any exception is logged via TSystem.LogError and the row is returned anyway
CREATE OR REPLACE FUNCTION picndoku__b90_iu__delayed_info()
  RETURNS TRIGGER AS
  $$
  DECLARE
    _rec      picndoku DEFAULT null; -- row whose delayed info is used (new, or old if the info was cleared)
    _du_bytea bytea;
    _success  boolean;
    _opmsg    varchar;
    _volltext varchar;

  BEGIN
    BEGIN
      _rec := new;

      -- on UPDATE with a cleared pd_dms_delayed_info fall back to the old row
      IF (TG_OP = 'UPDATE') THEN
        IF new.pd_dms_delayed_info IS NULL THEN
          _rec := old;
        END IF;
      END IF;

      IF (_rec.pd_dms_delayed_info IS NOT NULL) THEN
        -- extension = part after the last dot;
        -- inside an E'' string the backslash must be doubled to reach the regex as '\.'
        IF (LOWER(substring((_rec.pd_dmsremotefile)::varchar from E'\\.([^.]*)$')) = 'pdf') THEN
          IF new.pd_volltext IS NULL THEN
            _du_bytea := (tsystem.picndoku__delayed_upload__get(_rec, false))._du_bytea;

            IF (_du_bytea IS NOT NULL) THEN
              SELECT
                o_status,
                o_status_hint,
                o_file_text
              INTO
                _success,
                _opmsg,
                _volltext
              FROM
                TSystem.dms__pdf__extract_text(_du_bytea)
              ;

              IF (_success) THEN
                new.pd_volltext := _volltext;
                new.pd_volltext_status := null;
              ELSE
                new.pd_volltext_status := jsonb_build_object('status', _success, 'hint',   _opmsg);
              END IF;
            ELSE
              new.pd_volltext_status := jsonb_build_object('status', false, 'hint', 'retrieving data (bytea) failed');
            END IF;
          END IF;
        END IF;
      END IF;
    EXCEPTION
      WHEN OTHERS THEN
        -- deliberately swallowed: a failing text extraction must not block the insert/update
        PERFORM TSystem.LogError((SQLERRM)::varchar, '', 'picndoku__b90_iu__delayed_info');
    END;

    RETURN new;
  END;
  $$ LANGUAGE plpgsql;

-- fires on INSERT only for rows that carry a non-empty pd_dms_delayed_info
CREATE TRIGGER picndoku__b90_i__delayed_info
  BEFORE INSERT ON picndoku
  FOR EACH ROW
  WHEN (coalesce(new.pd_dms_delayed_info, '') <> '')
  EXECUTE FUNCTION picndoku__b90_iu__delayed_info();

-- fires on UPDATE only if pd_dms_delayed_info changed (NULL and '' are treated as equal)
CREATE TRIGGER picndoku__b90_u__delayed_info
  BEFORE UPDATE ON picndoku
  FOR EACH ROW
  WHEN (coalesce(old.pd_dms_delayed_info, '') <> coalesce(new.pd_dms_delayed_info, ''))
  EXECUTE FUNCTION picndoku__b90_iu__delayed_info();

